Train a convolutional neural network for sentiment analysis.

Based on "Convolutional Neural Networks for Sentence Classification" by Yoon Kim http://arxiv.org/pdf/1408.5882v2.pdf

CNN-non-static reaches 82.1% after 61 epochs with the following settings:
embedding_dim = 20, filter_sizes = (3, 4), num_filters = 3, dropout_prob = (0.7, 0.8), hidden_dims = 100

CNN-rand reaches 78-79% after 7-8 epochs with the following settings:
embedding_dim = 20, filter_sizes = (3, 4), num_filters = 150, dropout_prob = (0.25, 0.5), hidden_dims = 150

CNN-static reaches 75.4% after 7 epochs with the following settings:
embedding_dim = 100, filter_sizes = (3, 4), num_filters = 150, dropout_prob = (0.25, 0.5), hidden_dims = 150

It turns out that a data set as small as "Movie reviews with one sentence per review" (Pang and Lee, 2005) requires a much smaller network than the one introduced in the original article:

  • an embedding dimension of only 20 (instead of 300; 'CNN-static' still requires ~100)
  • 2 filter sizes (instead of 3)
  • higher dropout probabilities
  • 3 filters per filter size is enough for 'CNN-non-static' (instead of 100)
  • embedding initialization does not require the pre-built Google Word2Vec data; training Word2Vec on the same "Movie reviews" data set is enough to achieve the performance reported in the article (81.6%)

Another notable difference is a sliding MaxPooling window of length 2, instead of max pooling over the whole feature map as in the original article; the sketch below illustrates the difference.
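To make this concrete, here is a minimal illustrative sketch of the two pooling strategies, written in the same Keras 1.x functional API used throughout this notebook; the names illustration_in, global_pool, and sliding_pool are made up for the example:

In [ ]:
# Illustrative sketch only: contrasts the pooling used in this notebook with the
# pooling in the original article. A filter of length 3 over a sequence of 56
# word vectors yields a feature map of length 54 (56 - 3 + 1).
from keras.layers import Input, Convolution1D, MaxPooling1D

illustration_in = Input(shape=(56, 20))                    # (sequence_length, embedding_dim)
conv = Convolution1D(nb_filter=150, filter_length=3,
                     border_mode='valid', activation='relu',
                     subsample_length=1)(illustration_in)  # output shape: (54, 150)

# Original article: one max over the whole feature map -> shape (1, 150)
global_pool = MaxPooling1D(pool_length=54)(conv)

# This notebook: sliding window of length 2 (stride 2) -> shape (27, 150)
sliding_pool = MaxPooling1D(pool_length=2)(conv)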


In [ ]:
import numpy as np
import data_helpers                    # local helper module: loads and pads the data set
from w2v import train_word2vec         # local helper module: trains Word2Vec embeddings

from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18
# (it was previously in the now-removed sklearn.cross_validation)
from sklearn.model_selection import train_test_split

np.random.seed(2)

In [ ]:
# CNN-rand: embeddings initialized randomly and learned from scratch
# CNN-static: pre-trained Word2Vec embeddings, kept fixed during training
# CNN-non-static: pre-trained Word2Vec embeddings, fine-tuned during training
model_variation = 'CNN-rand'  # CNN-rand | CNN-non-static | CNN-static
print('Model variation is %s' % model_variation)

In [ ]:
# Model Hyperparameters
sequence_length = 56          # every sentence in the data set is padded to this length
embedding_dim = 20
filter_sizes = (3, 4)
num_filters = 150
dropout_prob = (0.25, 0.5)
hidden_dims = 150

In [ ]:
# Training parameters
batch_size = 32
num_epochs = 2

In [ ]:
# Word2Vec parameters, see train_word2vec
min_word_count = 1  # Minimum word count                        
context = 10        # Context window size
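
The w2v module itself is not shown in this notebook. For orientation, here is a minimal sketch of what such a train_word2vec helper might look like, assuming gensim; the function and variable names below are illustrative, and the real module may differ (e.g. it may cache the trained model to disk):

In [ ]:
# Illustrative sketch only -- not the actual w2v module shipped with this notebook.
# Assumes gensim with the 0.x/1.x parameter names (size, not vector_size).
from gensim.models import word2vec

def train_word2vec_sketch(sentence_matrix, vocabulary_inv,
                          num_features=20, min_word_count=1, context=10):
    # Map each row of word indices back to a list of tokens
    sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
    model = word2vec.Word2Vec(sentences, size=num_features,
                              min_count=min_word_count, window=context)
    # Return a single weight matrix ordered by vocabulary index, in the
    # list-of-arrays form expected by Embedding(weights=...)
    weights = [np.array([model[w] if w in model
                         else np.random.uniform(-0.25, 0.25, num_features)
                         for w in vocabulary_inv])]
    return weights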

In [ ]:
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()
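
data_helpers.load_data is expected to return the padded word-index matrix x (one row per sentence), the two-column one-hot label matrix y, a word-to-index dict vocabulary, and its index-to-word inverse vocabulary_inv. A quick sanity check on the shapes (exact sizes depend on the data set):

In [ ]:
# Expected: x has shape (n_sentences, sequence_length), y has shape (n_sentences, 2)
print(x.shape, y.shape, len(vocabulary), len(vocabulary_inv))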

In [ ]:
if model_variation == 'CNN-non-static' or model_variation == 'CNN-static':
    embedding_weights = train_word2vec(x, vocabulary_inv, embedding_dim, min_word_count, context)
    if model_variation == 'CNN-static':
        # Replace each word index with its (fixed) embedding vector, turning the
        # integer matrix x into a float tensor of shape (n, sequence_length, embedding_dim)
        x = embedding_weights[0][x]
elif model_variation == 'CNN-rand':
    embedding_weights = None
else:
    raise ValueError('Unknown model variation')
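
The CNN-static branch above relies on NumPy fancy indexing: indexing a (vocab_size, embedding_dim) matrix with an integer matrix returns the corresponding rows. A toy demonstration (made-up numbers, for illustration only):

In [ ]:
# Toy demonstration of the fancy-indexing lookup used for CNN-static above
toy_embeddings = np.arange(12.).reshape(4, 3)  # 4-word vocabulary, 3-dim embeddings
toy_sentences = np.array([[0, 2], [3, 1]])     # 2 sentences, 2 word indices each
print(toy_embeddings[toy_sentences].shape)     # (2, 2, 3): each index became a vector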

In [ ]:
# Stack features and labels side by side so they can be shuffled and split together
data = np.append(x, y, axis=1)

In [ ]:
train, test = train_test_split(data, test_size=0.15, random_state=0)

In [ ]:
X_test = test[:, :sequence_length]
Y_test = test[:, sequence_length:]

X_train = train[:, :sequence_length]
Y_train = train[:, sequence_length:]

# Subsample 2500 training examples (drawn with replacement) to keep training fast
train_rows = np.random.randint(0, X_train.shape[0], 2500)
X_train = X_train[train_rows]
Y_train = Y_train[train_rows]

In [ ]:
print("Vocabulary Size: {:d}".format(len(vocabulary)))

In [ ]:
def initialize():

    # graph_in is the shared input of the functional-API sub-graph;
    # convs collects the flattened output of each convolutional branch
    global graph_in
    global convs

    graph_in = Input(shape=(sequence_length, embedding_dim))
    convs = []

In [ ]:
# Building the first layer (convolution layer) of the network
def build_layer_1(filter_length):

    conv = Convolution1D(nb_filter=num_filters,
                         filter_length=filter_length,
                         border_mode='valid',
                         activation='relu',
                         subsample_length=1)(graph_in)
    return conv

In [ ]:
# Adding a max pooling layer (sliding window of length 2) to the network
def add_max_pooling(conv):

    pool = MaxPooling1D(pool_length=2)(conv)
    return pool

In [ ]:
# Adding a flattening layer to the network, before adding a dense layer
def add_flatten(conv_or_pool):

    flatten = Flatten()(conv_or_pool)
    return flatten

In [ ]:
def add_sequential(graph):

    # Main sequential model: embedding (skipped for CNN-static, whose input is
    # already embedded), dropout, the convolutional sub-graph, and a 2-unit
    # sigmoid output matched to the two-column one-hot labels
    model = Sequential()
    if model_variation != 'CNN-static':
        model.add(Embedding(len(vocabulary), embedding_dim, input_length=sequence_length,
                            weights=embedding_weights))
    model.add(Dropout(dropout_prob[0], input_shape=(sequence_length, embedding_dim)))
    model.add(graph)
    model.add(Dense(2))
    model.add(Activation('sigmoid'))

    return model

In [ ]:
# 1. Convolution -> 2. Flatten
def one_layer_convolution():
    
    initialize()
    
    conv = build_layer_1(3)
    flatten = add_flatten(conv)
    
    convs.append(flatten)
    out = convs[0]

    graph = Model(input=graph_in, output=out)
    
    model = add_sequential(graph)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    
    model.fit(X_train, Y_train, batch_size=batch_size,
              nb_epoch=1, validation_data=(X_test, Y_test))
    
    score = model.evaluate(X_test, Y_test, verbose=0)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])

In [ ]:
# 1. Convolution -> 2. Max pooling -> 3. Flatten
def two_layer_convolution():
    
    initialize()
    
    conv = build_layer_1(3)
    pool = add_max_pooling(conv)
    flatten = add_flatten(pool)
    
    convs.append(flatten)
    out = convs[0]

    graph = Model(input=graph_in, output=out)
    
    model = add_sequential(graph)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    
    model.fit(X_train, Y_train, batch_size=batch_size,
              nb_epoch=num_epochs, validation_data=(X_test, Y_test))
    
    score = model.evaluate(X_test, Y_test, verbose=0)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])

In [ ]:
# 1. Convolution -> 2. Max pooling -> 3. Flatten -> 4. Convolution -> 5. Flatten
def three_layer_convolution():
    
    initialize()
    
    conv = build_layer_1(3)
    pool = add_max_pooling(conv)
    flatten = add_flatten(pool)
    
    convs.append(flatten)
    
    conv = build_layer_1(4)
    flatten = add_flatten(conv)
    
    convs.append(flatten)
    
    if len(filter_sizes)>1:
        out = Merge(mode='concat')(convs)
    else:
        out = convs[0]

    graph = Model(input=graph_in, output=out)
    
    model = add_sequential(graph)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    
    model.fit(X_train, Y_train, batch_size=batch_size,
              nb_epoch=1, validation_data=(X_test, Y_test))
    
    score = model.evaluate(X_test, Y_test, verbose=0)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])

In [ ]:
# 1. Convolution -> 2. Max pooling -> 3. Flatten -> 4. Convolution -> 5. Max pooling -> 6. Flatten
def four_layer_convolution():
    
    initialize()
    
    conv = build_layer_1(3)
    pool = add_max_pooling(conv)
    flatten = add_flatten(pool)
    
    convs.append(flatten)
    
    conv = build_layer_1(4)
    pool = add_max_pooling(conv)
    flatten = add_flatten(pool)
    
    convs.append(flatten)
    
    if len(filter_sizes)>1:
        out = Merge(mode='concat')(convs)
    else:
        out = convs[0]

    graph = Model(input=graph_in, output=out)
    
    model = add_sequential(graph)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    
    model.fit(X_train, Y_train, batch_size=batch_size,
              nb_epoch=num_epochs, validation_data=(X_test, Y_test))
    
    score = model.evaluate(X_test, Y_test, verbose=0)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])

In [ ]:
%%time
# 1. Convolution -> 2. Flatten
one_layer_convolution()

In [ ]:
%%time
# 1. Convolution -> 2. Max pooling -> 3. Flatten
two_layer_convolution()

In [ ]:
%%time
# 1. Convolution -> 2. Max pooling -> 3. Flatten -> 4. Convolution -> 5. Flatten
three_layer_convolution()

In [ ]:
%%time
# 1. Convolution -> 2. Max pooling -> 3. Flatten -> 4. Convolution -> 5. Max pooling -> 6. Flatten
four_layer_convolution()